library(mosaic)
library(tidyverse)
library(pander)
library(DT)
library(ggrepel)
library(plotly)
library(dplyr)
library(ggplot2)
library(maps)
library(tmap)
library(leaflet)
library(htmltools)
library(car)
library(mosaicData)
library(ResourceSelection)
library(reshape2)
library(RColorBrewer)
library(scatterplot3d)
library(readr)
library(prettydoc)
library(knitr)
library(kableExtra)
library(formattable)
library(haven)
library(reshape2)
library(GGally)
In this study, we will be exploring every aspect of residential homes in Ames, Iowa in order to predict the final price of each home.
Below, I start by mutating and determining which variables to utilize. Click the tab below to see that exploration.
First, we’ll take a look at the pairs plot.
Alright, I’ll say it first, this thing is HUGE!! So, let’s first consider some main criteria when it comes to picking a home and choose those specific variables to look at in the pairs plot.
When choosing variables, I had 3 main criteria I wanted to hit:
With that, I chose these variables to look at that fit my criteria.
# Read the training data; text columns are loaded as factors.
train <- read.csv("train.csv", stringsAsFactors = TRUE)

# First look: pairs plot limited to the hand-picked size, quality, garage,
# neighborhood and year columns.
first_look_vars <- c(
  "SalePrice", "GrLivArea", "OverallQual", "TotalBsmtSF", "GarageCars",
  "GarageArea", "X1stFlrSF", "X2ndFlrSF", "Neighborhood", "YearBuilt",
  "YearRemodAdd"
)
pairs(train[first_look_vars])
In order to fit the data with those criteria in mind, I mutated the data to add more columns to our model. I created these new columns:
TotalSF : The total surface area of the house, including all floors of the house (first, second, and basement) and the garage
LocationScore : captures the location quality based on two factors
UtilityScore : based on the home’s usability
TimeRemodel : The Year it was Sold - The Year it was Remodeled = how many years had passed since the home was last remodeled, up to the year it sold
OverallScore : the average rating of the overall condition and overall quality/finish of the home
Overall, here are the variables I chose to use and what they can tell us in terms of this study.
# knitr provides kable(), used below to render the table.
library(knitr)

# One entry per model term, in the same order across all three vectors.
term_names <- c(
  "SalePrice", "TotalSF", "LocationScore", "UtilityScore", "TimeRemodel",
  "Neighborhood", "OverallScore", "Neighborhood:TotalSF"
)

# What each term measures.
term_descriptions <- c(
  "The final price at which the house was sold.",
  "Total square footage of the house, including basement and garage.",
  "A score that evaluates the desirability of the neighborhood and location conditions.",
  "A score representing the house’s overall utility, considering space, features, and livability.",
  "Number of years since the last remodeling or addition was completed.",
  "The specific neighborhood in which the house is located.",
  "An average of Overall Quality and Overall Condition ratings.",
  "An interaction term that accounts for how the effect of total square footage varies across neighborhoods."
)

# Why each term is expected to matter for SalePrice.
term_benefits <- c(
  "Target variable we are trying to predict.",
  "Bigger houses generally sell for more, making this a key predictor.",
  "Homes in desirable locations tend to have higher sale prices.",
  "Higher utility scores indicate more livable homes, increasing value.",
  "More recently remodeled homes tend to sell for higher prices.",
  "Neighborhood greatly influences home values due to amenities and demand.",
  "Houses with better quality and condition typically sell for more.",
  "Captures how the impact of house size varies depending on the neighborhood."
)

# Assemble the three columns into the summary data frame.
table_data <- data.frame(
  Variable = term_names,
  Description = term_descriptions,
  How_it_Helps_Us = term_benefits
)

# Render as a markdown table with reader-friendly column headers.
kable(
  table_data,
  format = "markdown",
  col.names = c("Variable", "What it Looks at", "How it Helps Us")
)
| Variable | What it Looks at | How it Helps Us |
|---|---|---|
| SalePrice | The final price at which the house was sold. | Target variable we are trying to predict. |
| TotalSF | Total square footage of the house, including basement and garage. | Bigger houses generally sell for more, making this a key predictor. |
| LocationScore | A score that evaluates the desirability of the neighborhood and location conditions. | Homes in desirable locations tend to have higher sale prices. |
| UtilityScore | A score representing the house’s overall utility, considering space, features, and livability. | Higher utility scores indicate more livable homes, increasing value. |
| TimeRemodel | Number of years since the last remodeling or addition was completed. | More recently remodeled homes tend to sell for higher prices. |
| Neighborhood | The specific neighborhood in which the house is located. | Neighborhood greatly influences home values due to amenities and demand. |
| OverallScore | An average of Overall Quality and Overall Condition ratings. | Houses with better quality and condition typically sell for more. |
| Neighborhood:TotalSF | An interaction term that accounts for how the effect of total square footage varies across neighborhoods. | Captures how the impact of house size varies depending on the neighborhood. |
# Map a categorical column onto numeric scores.
#
# x:       factor or character vector of category labels.
# scores:  named numeric vector; names are labels, values are their scores.
# default: score given to labels that are not in `scores` AND to missing
#          values (NA_real_ leaves them missing).
# Returns an unnamed numeric vector the same length as `x`.
score_levels <- function(x, scores, default = NA_real_) {
  out <- unname(scores[as.character(x)])
  out[is.na(out)] <- default
  out
}

# Shared five-step quality scale used by the *QC / *Qual / *Cond columns.
quality_scores <- c(Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1)

# Feature engineering: adds the size, utility, outdoor, location and remodel
# columns used by the models and plots. All helper *_score columns are kept
# in the data frame.
train <- train %>%
  mutate(
    # First floor + second floor + basement + garage area.
    TotalSF = X1stFlrSF + X2ndFlrSF + TotalBsmtSF + GarageArea,
    # Bathrooms (half baths count 0.5), kitchens and bedrooms.
    TotalRoom = FullBath + (HalfBath * 0.5) + BsmtFullBath +
      (BsmtHalfBath * 0.5) + KitchenAbvGr + BedroomAbvGr,

    # --- ordinal encodings; unknown or missing labels score 0 ---
    Utilities_score = score_levels(
      Utilities, c(AllPub = 4, NoSewr = 3, NoSeWa = 2, ELO = 1),
      default = 0
    ),
    Street_score = score_levels(Street, c(Pave = 1, Grvl = 0), default = 0),
    # read.csv() turns the "NA" label into a real missing value, so a test
    # such as Alley == "NA" can never match; missing alleys are scored 0
    # through `default` instead.
    Alley_score = score_levels(Alley, c(Pave = 2, Grvl = 1), default = 0),
    LandSlope_score = score_levels(
      LandSlope, c(Gtl = 2, Mod = 1, Sev = 0),
      default = 0
    ),
    CentralAir_score = ifelse(CentralAir == "Y", 1, 0),
    PavedDrive_score = score_levels(
      PavedDrive, c(Y = 2, P = 1, N = 0),
      default = 0
    ),
    # Ratings divided by 10 (the original note says they are on a 1-10
    # scale, which would put these in 0.1-1).
    OverallQual_norm = OverallQual / 10,
    OverallCond_norm = OverallCond / 10,
    HeatingQC_score = score_levels(HeatingQC, quality_scores, default = 0),
    KitchenQual_score = score_levels(KitchenQual, quality_scores, default = 0),
    Functional_score = score_levels(
      Functional,
      c(Typ = 5, Min1 = 4, Min2 = 3, Mod = 2, Maj1 = 1, Maj2 = 0),
      default = 0
    ),

    # Weighted usability score.
    # NOTE(review): area columns enter unscaled (square feet) next to 0-5
    # ordinal scores, so the area terms dominate the total -- confirm that
    # this is intended.
    UtilityScore = (0.15 * Utilities_score) +
      (0.10 * GrLivArea) +
      (0.07 * TotalBsmtSF) +
      (0.06 * GarageArea) +
      (0.05 * KitchenQual_score) +
      (0.05 * HeatingQC_score) +
      (0.05 * Functional_score) +
      (0.04 * PavedDrive_score) +
      (0.03 * Alley_score) +
      (0.02 * Street_score) +
      (0.02 * LandSlope_score) +
      (0.02 * CentralAir_score) +
      (0.05 * WoodDeckSF) +
      (0.05 * OpenPorchSF),

    # --- exterior-appearance scores (popularity ranking chosen by author) ---
    # Missing styles/shapes become the level "None", which has no score and
    # therefore yields NA (and an NA OutdoorScore).
    HouseStyle = as.factor(replace_na(as.character(HouseStyle), "None")),
    HouseStyle_Score = score_levels(
      HouseStyle,
      c(
        "2.5Fin" = 8, "2Story" = 7, "1Story" = 6, "SLvl" = 5,
        "2.5Unf" = 4, "1.5Fin" = 3, "SFoyer" = 2, "1.5Unf" = 1
      )
    ),
    LotShape = as.factor(replace_na(as.character(LotShape), "None")),
    LotShape_Score = score_levels(
      LotShape, c(Reg = 4, IR1 = 3, IR2 = 2, IR3 = 1)
    ),
    ExterQual = as.factor(as.character(ExterQual)),
    ExterQual_Score = score_levels(ExterQual, quality_scores),
    ExterCond = as.factor(as.character(ExterCond)),
    ExterCond_Score = score_levels(ExterCond, quality_scores),

    # Mean of the overall quality and overall condition ratings.
    OverallScore = (OverallQual + OverallCond) / 2,

    # Neighborhood tier (2-5) plus a proximity adjustment: +2 for a positive
    # feature, -1 for an arterial/feeder street or railroad, otherwise 0.
    # The positive test comes first, so it wins when both apply.
    LocationScore = case_when(
      Neighborhood %in% c("NoRidge", "NridgHt", "StoneBr", "Veenker") ~ 5,
      Neighborhood %in% c("NWAmes", "Somerst", "Timber", "ClearCr") ~ 4,
      Neighborhood %in% c("Sawyer", "SawyerW", "Edwards", "BrkSide") ~ 2,
      TRUE ~ 3
    ) + case_when(
      Condition1 %in% c("PosN", "PosA") |
        Condition2 %in% c("PosN", "PosA") ~ 2,
      Condition1 %in% c("Artery", "Feedr", "RRAn", "RRNe") |
        Condition2 %in% c("Artery", "Feedr", "RRAn", "RRNe") ~ -1,
      TRUE ~ 0
    ),

    # 1 for the neighborhoods the author flags as popular, else 0.
    PopularNbrHd = as.numeric(
      Neighborhood %in% c(
        "NAmes", "CollgCr", "OldTown", "Edwards", "Somerst", "Gilbert",
        "NridgHt", "Sawyer", "NWAmes", "SawyerW"
      )
    ),

    # Years between the last remodel and the sale.
    TimeRemodel = YrSold - YearRemodAdd,

    # Sum of the four exterior-appearance scores.
    OutdoorScore = HouseStyle_Score + LotShape_Score + ExterQual_Score +
      ExterCond_Score
  )
# Working model: SalePrice on size, location, utility, remodel age and
# overall score, with a separate TotalSF slope for every neighborhood.
house.lm <- lm(
  SalePrice ~ TotalSF + LocationScore + UtilityScore + TimeRemodel +
    Neighborhood + Neighborhood:TotalSF + OverallScore,
  data = train
)

# Pairs plot of the response and the model's predictors, with a smoother
# drawn in every panel.
model_vars <- c(
  "SalePrice", "TotalSF", "TimeRemodel", "UtilityScore", "Neighborhood",
  "LocationScore", "OverallScore"
)
pairs(train[model_vars], panel = panel.smooth)
The visuals below look at how each variable affects and interacts with the others when it comes to predicting SalePrice.
Some of them will be difficult to read at a glance, so a faceted version of each graph is given to look at each factor level individually. Click through the tabs to see each visual.
Key Findings:
TotalSF has on SalePrice
SalePrices at any given TotalSF (ex. StoneBr)
TSA.N <- ggplot(train, aes(y = SalePrice, x = TotalSF, color = factor(Neighborhood))) +
geom_point() +
geom_line(aes(y = house.lm$fit, group = interaction(LocationScore)), cex = 0.5) +
theme_minimal()
ggplotly(TSA.N)
# Faceted view: one SalePrice-vs-TotalSF panel per neighborhood, each with
# its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = TotalSF, y = SalePrice, color = factor(Neighborhood))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ Neighborhood) +
  theme_minimal()
Key Findings:
LocationScore categories, they all seem to be within the same range of square footage (they all look a bit more clustered within a specific range, despite some outliers)
LocationScore 5 has a steeper slope, thus the price increases more rapidly as square footage increases in comparison to LocationScore 4
TSA.LS <- ggplot(train, aes(y = SalePrice, x = TotalSF, color = factor(LocationScore))) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(TSA.LS)
# Faceted view: one SalePrice-vs-TotalSF panel per LocationScore value, each
# with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = TotalSF, y = SalePrice, color = factor(LocationScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ LocationScore) +
  theme_minimal()
Key Findings:
OverallScore values between 3-7 that as the quality of a home improves, the SalePrice per TotalSF increases prices of homes
OverallScore value of 7.5, there are instances where a house ranked 7.5 with a large square footage sold very low and a home with a lower square footage sold very high, so this graph’s interpretation is a bit confusing
TSA.OS <- ggplot(train, aes(y = SalePrice, x = TotalSF, color = factor(OverallScore))) +
geom_point(size=1) +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(TSA.OS)
# Faceted view: one SalePrice-vs-TotalSF panel per OverallScore value, each
# with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = TotalSF, y = SalePrice, color = factor(OverallScore))
) +
  geom_point(size = 1) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ OverallScore) +
  theme_minimal()
Key Findings:
Neighborhood categories (Blmngtn, NridgHt, Somerst, etc.) show that their homes were renovated a lot more recently, based on how small their TimeRemodel values are
SalePrice
TSR.N <- ggplot(train, aes(y = SalePrice, x = TimeRemodel, color = factor(Neighborhood))) +
geom_point(size=1) +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(TSR.N)
# Faceted view: one SalePrice-vs-TimeRemodel panel per neighborhood, each
# with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(Neighborhood))
) +
  geom_point(size = 1) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ Neighborhood) +
  theme_minimal()
Key Findings:
LocationScore of 5, the more recently something is renovated, the more the SalePrice increases based on that desirable location
SalePrice decreases based on that location
LocationScore scores show that regardless of how late or how recent the home was renovated, the SalePrice of the home stays fairly consistent or shows a slight decrease, despite the LocationScore
TSR.LS <- ggplot(train, aes(y = SalePrice, x = TimeRemodel, color = factor(LocationScore))) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(TSR.LS)
# Faceted view: one SalePrice-vs-TimeRemodel panel per LocationScore value,
# each with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(LocationScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ LocationScore) +
  theme_minimal()
Key Findings:
SalePrice patterns mirror what we observed in the LocationScore graphs for homes with OverallScore ratings between 4-6
SalePrice, regardless of the home’s OverallScore
OverallScore ratings, even when comparing recently and previously renovated properties
# SalePrice against years since remodel, coloured by OverallScore, with one
# least-squares line per score; shown as an interactive plotly widget.
TSR.OS <- ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(OverallScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()
ggplotly(TSR.OS)

# Same plot as a static figure, split into one panel per OverallScore value.
ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(OverallScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ OverallScore) +
  theme_minimal()
Key Findings:
Neighborhood (StoneBr, NridgHt, etc.) show a steeper slope, showing that a higher UtilityScore results in a higher SalePrice (expensive places have more things)
UtilityScore and thus a smaller SalePrice (cheaper places don’t have that much stuff)
U.N <- ggplot(train, aes(y = SalePrice, x = UtilityScore, color = factor(Neighborhood))) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(U.N)
# Faceted view: one SalePrice-vs-UtilityScore panel per neighborhood, each
# with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = UtilityScore, y = SalePrice, color = factor(Neighborhood))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ Neighborhood) +
  theme_minimal()
Key Findings:
UtilityScore seems to be prioritized over LocationScore
UtilityScore increases as well as the SalePrice
U.LS <- ggplot(train, aes(y = SalePrice, x = UtilityScore, color = factor(LocationScore))) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(U.LS)
# Faceted view: one SalePrice-vs-UtilityScore panel per LocationScore value,
# each with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = UtilityScore, y = SalePrice, color = factor(LocationScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ LocationScore) +
  theme_minimal()
Key Findings:
OverallScore goes from smallest to largest, we can see that the UtilityScore increases along with the SalePrice
SalePrice as UtilityScore increases with every increase of OverallScore
U.OS <- ggplot(train, aes(y = SalePrice, x = UtilityScore, color = factor(OverallScore))) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, se = FALSE, size = 2) +
theme_minimal()
ggplotly(U.OS)
# Faceted view: one SalePrice-vs-UtilityScore panel per OverallScore value,
# each with its own least-squares line.
ggplot(
  data = train,
  mapping = aes(x = UtilityScore, y = SalePrice, color = factor(OverallScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  facet_wrap(~ OverallScore) +
  theme_minimal()
Now, we will put the model to use through testing as well as interpretation.
This is the mathematical model on which my regression model is based. The following shows:
\[ \underbrace{Y_i}_{SalePrice} = \beta_0 + \beta_1\underbrace{X_{i1}}_{TotalSF} + \beta_2\underbrace{X_{i2}}_{LocationScore} + \beta_3\underbrace{X_{i3}}_{UtilityScore} + \beta_4\underbrace{X_{i4}}_{TimeRemodel} + \beta_5\underbrace{X_{i5}}_{Neighborhood} + \beta_6\underbrace{X_{i6}}_{OverallScore} + \beta_7\underbrace{X_{i5}X_{i1}}_{Neighborhood:TotalSF} + \epsilon_i \quad \text{where } \epsilon_i \sim N(0, \sigma^2) \]
Neighborhood, LocationScore, and OverallScore within the model.
Proceed to the next tab to see our results
After looking at this regression, the most significant variables are:
LocationScore(0.001629) : As the location quality
increases, the predicted sale price of a home would increase by $5480
per location score.
UtilityScore(7.724e-16) : As quality and quantity of
utilities increase, the predicted sale price of a home would increase by
$525.90 per utility score.
TimeRemodel (1.46e-05) : For each additional year
since a home was remodeled/renovated, the predicted sale price would
decrease by $213.10.
OverallScore(1.221e-48) : As the overall quality and
condition of a home increases, the sale price of a home would increase
by $16951 per overall score.
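NeighborhoodStoneBr(0.04949) : Houses in the Stone Brook
neighborhood are predicted to sell for $164471 less than houses in the
baseline neighborhood (Blmngtn, the reference level that does not appear in
the coefficient table), before the neighborhood’s own TotalSF slope is taken
into account. Therefore, other neighborhoods might be more
desirable or houses in this neighborhood could be more open to
bargaining.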
TotalSF:NeighborhoodEdwards (0.02362) : For each
additional square foot of house in the Edwards neighborhood, the sale
price changes by $55.22 less than it does in the baseline neighborhood. (This is the result of one very
big house [13170 \(ft^2\), costing
$160,000] costing less than smaller houses that cost a bigger amount of
money.)
TotalSF:NeighborhoodStoneBr (0.03662) : For each
additional square foot of house in the Stone Brook neighborhood, the
sale price increases by $52.54 more than it does in the baseline neighborhood. While the
neighborhood’s houses themselves can start out fairly cheap, that doesn’t
take away from the fact that sale prices rise as house size
increases.
The individual results will vary by as much as 53636.
The insignificance of some of these variables could be due to:
# Original (train data)
# Variant of the model in which LocationScore, Neighborhood and OverallScore
# all enter as categorical terms, so every level gets its own coefficient.
houseO.lm <- lm(
  SalePrice ~ TotalSF + as.factor(LocationScore) + UtilityScore + TimeRemodel +
    as.factor(Neighborhood) + as.factor(OverallScore) + Neighborhood:TotalSF,
  data = train
)

# Render the coefficient table and fit statistics.
pander(summary(houseO.lm))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -92979 | 82601 | -1.126 | 0.2605 |
| TotalSF | 23.1 | 24.51 | 0.9424 | 0.3462 |
| as.factor(LocationScore)2 | 13100 | 4186 | 3.129 | 0.001789 |
| as.factor(LocationScore)3 | 19107 | 5264 | 3.63 | 0.0002939 |
| as.factor(LocationScore)4 | 17613 | 7427 | 2.372 | 0.01785 |
| as.factor(LocationScore)5 | 24780 | 8571 | 2.891 | 0.003898 |
| as.factor(LocationScore)6 | 24724 | 11797 | 2.096 | 0.03629 |
| as.factor(LocationScore)7 | -25296 | 28562 | -0.8857 | 0.376 |
| UtilityScore | 465.6 | 64.51 | 7.217 | 8.721e-13 |
| TimeRemodel | -208.4 | 48.7 | -4.278 | 2.013e-05 |
| as.factor(Neighborhood)Blueste | 16111 | 212177 | 0.07593 | 0.9395 |
| as.factor(Neighborhood)BrDale | 37853 | 90115 | 0.42 | 0.6745 |
| as.factor(Neighborhood)BrkSide | 61898 | 79202 | 0.7815 | 0.4346 |
| as.factor(Neighborhood)ClearCr | 50776 | 84841 | 0.5985 | 0.5496 |
| as.factor(Neighborhood)CollgCr | 30332 | 78597 | 0.3859 | 0.6996 |
| as.factor(Neighborhood)Crawfor | 48508 | 79432 | 0.6107 | 0.5415 |
| as.factor(Neighborhood)Edwards | 138769 | 78279 | 1.773 | 0.07649 |
| as.factor(Neighborhood)Gilbert | 37231 | 80184 | 0.4643 | 0.6425 |
| as.factor(Neighborhood)IDOTRR | 18342 | 80963 | 0.2266 | 0.8208 |
| as.factor(Neighborhood)MeadowV | 80167 | 80315 | 0.9982 | 0.3184 |
| as.factor(Neighborhood)Mitchel | 64982 | 79508 | 0.8173 | 0.4139 |
| as.factor(Neighborhood)NAmes | 85739 | 78315 | 1.095 | 0.2738 |
| as.factor(Neighborhood)NoRidge | -141931 | 80776 | -1.757 | 0.07912 |
| as.factor(Neighborhood)NPkVill | 96151 | 184902 | 0.52 | 0.6031 |
| as.factor(Neighborhood)NridgHt | -103230 | 80188 | -1.287 | 0.1982 |
| as.factor(Neighborhood)NWAmes | 80122 | 80435 | 0.9961 | 0.3194 |
| as.factor(Neighborhood)OldTown | 61370 | 78514 | 0.7817 | 0.4346 |
| as.factor(Neighborhood)Sawyer | 89864 | 79401 | 1.132 | 0.2579 |
| as.factor(Neighborhood)SawyerW | 41832 | 79331 | 0.5273 | 0.5981 |
| as.factor(Neighborhood)Somerst | 16677 | 79690 | 0.2093 | 0.8343 |
| as.factor(Neighborhood)StoneBr | -156328 | 82479 | -1.895 | 0.05825 |
| as.factor(Neighborhood)SWISU | 73287 | 80895 | 0.9059 | 0.3651 |
| as.factor(Neighborhood)Timber | 1446 | 81612 | 0.01772 | 0.9859 |
| as.factor(Neighborhood)Veenker | -133682 | 95439 | -1.401 | 0.1615 |
| as.factor(OverallScore)2 | 52988 | 39148 | 1.354 | 0.1761 |
| as.factor(OverallScore)2.5 | 31556 | 30040 | 1.05 | 0.2937 |
| as.factor(OverallScore)3 | 9263 | 29503 | 0.314 | 0.7536 |
| as.factor(OverallScore)3.5 | 39540 | 27525 | 1.437 | 0.1511 |
| as.factor(OverallScore)4 | 34736 | 27009 | 1.286 | 0.1986 |
| as.factor(OverallScore)4.5 | 44353 | 26723 | 1.66 | 0.09719 |
| as.factor(OverallScore)5 | 52622 | 26649 | 1.975 | 0.04851 |
| as.factor(OverallScore)5.5 | 58594 | 26633 | 2.2 | 0.02797 |
| as.factor(OverallScore)6 | 64095 | 26655 | 2.405 | 0.01632 |
| as.factor(OverallScore)6.5 | 73519 | 26708 | 2.753 | 0.005988 |
| as.factor(OverallScore)7 | 90897 | 26805 | 3.391 | 0.0007159 |
| as.factor(OverallScore)7.5 | 96859 | 27250 | 3.554 | 0.0003916 |
| as.factor(OverallScore)8 | 116979 | 27890 | 4.194 | 2.91e-05 |
| as.factor(OverallScore)8.5 | 119578 | 31069 | 3.849 | 0.0001241 |
| as.factor(OverallScore)9.5 | 245998 | 33922 | 7.252 | 6.806e-13 |
| TotalSF:NeighborhoodBlueste | -9.242 | 82.49 | -0.112 | 0.9108 |
| TotalSF:NeighborhoodBrDale | -20.7 | 32.61 | -0.6349 | 0.5256 |
| TotalSF:NeighborhoodBrkSide | -22.76 | 24.5 | -0.9288 | 0.3531 |
| TotalSF:NeighborhoodClearCr | -12.99 | 25.71 | -0.5051 | 0.6135 |
| TotalSF:NeighborhoodCollgCr | -6.758 | 24.06 | -0.2809 | 0.7789 |
| TotalSF:NeighborhoodCrawfor | -12.4 | 24.3 | -0.5104 | 0.6098 |
| TotalSF:NeighborhoodEdwards | -54.87 | 23.95 | -2.291 | 0.02212 |
| TotalSF:NeighborhoodGilbert | -7.09 | 24.67 | -0.2874 | 0.7738 |
| TotalSF:NeighborhoodIDOTRR | -13.37 | 25.76 | -0.5189 | 0.6039 |
| TotalSF:NeighborhoodMeadowV | -40.73 | 25.58 | -1.592 | 0.1116 |
| TotalSF:NeighborhoodMitchel | -23.7 | 24.44 | -0.9698 | 0.3323 |
| TotalSF:NeighborhoodNAmes | -33.03 | 24.01 | -1.376 | 0.169 |
| TotalSF:NeighborhoodNoRidge | 36.19 | 24.23 | 1.493 | 0.1356 |
| TotalSF:NeighborhoodNPkVill | -37.68 | 68.07 | -0.5535 | 0.58 |
| TotalSF:NeighborhoodNridgHt | 33.52 | 24.22 | 1.384 | 0.1666 |
| TotalSF:NeighborhoodNWAmes | -28.27 | 24.53 | -1.152 | 0.2493 |
| TotalSF:NeighborhoodOldTown | -33.77 | 24.11 | -1.401 | 0.1614 |
| TotalSF:NeighborhoodSawyer | -32.9 | 24.51 | -1.343 | 0.1796 |
| TotalSF:NeighborhoodSawyerW | -9.833 | 24.27 | -0.4051 | 0.6855 |
| TotalSF:NeighborhoodSomerst | 1.446 | 24.27 | 0.05956 | 0.9525 |
| TotalSF:NeighborhoodStoneBr | 51.08 | 24.69 | 2.069 | 0.03873 |
| TotalSF:NeighborhoodSWISU | -35.39 | 24.99 | -1.416 | 0.157 |
| TotalSF:NeighborhoodTimber | 4.267 | 24.66 | 0.173 | 0.8627 |
| TotalSF:NeighborhoodVeenker | 45.63 | 28.78 | 1.586 | 0.113 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1460 | 26353 | 0.8954 | 0.89 |
Now, we will validate our model. The verification of this model will help us know that the model fit on this one sample of data will continue to fit well on a new sample of data. This will be verified through the Validation Adjusted \(R^2\). This is calculated with the code below and presented with the other \(R^2\) values for comparison:
# Hold-out validation: fit the model on a random subset of `train`, predict
# the remaining rows, and compare in-sample and out-of-sample R^2.
set.seed(12242003) # fixed seed so the split is reproducible

# Use up to 1000 rows for fitting (1460 rows in total); the rest are held out.
num_rows <- min(1000, nrow(train))
keep <- sample(seq_len(nrow(train)), num_rows)
mytrain <- train[keep, ] # fitting sample: lm(..., data = mytrain)
mytest <- train[-keep, ] # hold-out sample: predict(..., newdata = mytest)

# Refit the working model on the fitting sample only. This overwrites the
# earlier `house.lm` that was fitted on all of `train`.
house.lm <- lm(
  SalePrice ~ TotalSF + LocationScore + UtilityScore + TimeRemodel +
    Neighborhood + Neighborhood:TotalSF + OverallScore,
  data = mytrain
)

# Predict the hold-out rows; any prediction that comes back missing is
# replaced by the mean SalePrice of the fitting sample.
yh_myhouse <- predict(house.lm, newdata = mytest)
if (anyNA(yh_myhouse)) {
  yh_myhouse[is.na(yh_myhouse)] <- mean(mytrain$SalePrice, na.rm = TRUE)
}

# Validation R^2: 1 - SSE / SSTO, computed on the hold-out rows.
ybar <- mean(mytest$SalePrice)
SSTO <- sum((mytest$SalePrice - ybar)^2)
SSE_myhouse <- sum((mytest$SalePrice - yh_myhouse)^2)
rs_hd <- 1 - SSE_myhouse / SSTO

# Adjusted version, penalised by the number of fitted coefficients.
n <- nrow(mytest)
p_myhouse <- length(coef(house.lm))
rsa_myhouse <- 1 - (n - 1) / (n - p_myhouse) * SSE_myhouse / SSTO

# One-row comparison table; check.names = FALSE keeps the LaTeX headers.
fit_summary <- summary(house.lm)
house.table <- data.frame(
  "Original $R^2$" = fit_summary$r.squared,
  "Original Adj. $R^2$" = fit_summary$adj.r.squared,
  "Validation $R^2$" = rs_hd,
  "Validation Adj. $R^2$" = rsa_myhouse,
  check.names = FALSE
)
knitr::kable(house.table, escape = TRUE, digits = 4)
| Original \(R^2\) | Original Adj. \(R^2\) | Validation \(R^2\) | Validation Adj. \(R^2\) |
|---|---|---|---|
| 0.8929 | 0.887 | 0.8739 | 0.8574 |
As we can see, the Adjusted \(R^2\) drops from 0.8870 (original) to 0.8574 (validation). With a difference of just 0.0296, the model captures the essence of the data fairly well and shows no signs of overfitting.
# Scratch exploration of an alternative model. Restored to parseable R:
# straight quotes, the `*` operators that markdown rendering had swallowed,
# one statement per line, and the model fitted before it is used.

# NOTE(review): `train2` is not defined in the visible part of this file --
# confirm it exists before running this line.
plot(SalePrice ~ ., data = train2)

# Alternative model on size, lot, garage, alley, bathrooms and porch.
househd <- lm(
  SalePrice ~ TotalSF + LotArea + GarageArea + Alley + FullBath + ScreenPorch,
  data = train
)
summary(househd)

# Coefficients, plus a text template of the fitted equation.
b <- coef(househd)
b
paste0("b[", seq_along(b), "]*", names(b), collapse = "+")

# Fitted line over GarageArea with the other predictors held at fixed values,
# one panel per Alley x FullBath combination.
# NOTE(review): the b[1]..b[8] indexing assumes Alley contributes two dummy
# terms (AlleyNone, AlleyPave), i.e. missing Alley recoded to "None". Verify
# against names(b); otherwise the indices point at the wrong coefficients.
ggplot(
  househd,
  aes(x = GarageArea, y = SalePrice, color = interaction(Alley, FullBath))
) +
  geom_point() +
  stat_function(
    fun = function(GarageArea, TotalSF = 1200, LotArea = 10000, AlleyNone = 1,
                   AlleyPave = 0, FullBath = 2, ScreenPorch = 0) {
      b[1] + b[2] * TotalSF + b[3] * LotArea + b[4] * GarageArea +
        b[5] * AlleyNone + b[6] * AlleyPave + b[7] * FullBath +
        b[8] * ScreenPorch
    }
  ) +
  facet_wrap(~ interaction(Alley, FullBath))